Dependencies

library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)

set.seed(1)

Data

airbubbles <- read_csv("Data/libraries_FC100/Airbubbles_library_FC100.csv")
airbubbles$species <- "airbubbles"

Chlamydomonas <- read_csv("Data/libraries_FC100/Chlamydomonas_library_FC100.csv")
Chlamydomonas$species <- "Chlamydomonas"

ChlamydomonasClumps <- read_csv("Data/libraries_FC100/ChlamydomonasClumps_library_FC100.csv")
ChlamydomonasClumps$species <- "ChlamydomonasClumps"

ColepsIrchel <- read_csv("Data/libraries_FC100/ColepsIrchel_library_FC100.csv")
ColepsIrchel$species <- "Coleps_irchel"

ColepsViridis <- read_csv("Data/libraries_FC100/ColepsViridis_library_FC100.csv")
ColepsViridis$species <- "Coleps_viridis"

Colpidium <- read_csv("Data/libraries_FC100/Colpidium_library_FC100.csv")
Colpidium$species <- "Colpidium"

ColpidiumVacuoles <- read_csv("Data/libraries_FC100/ColpidiumVacuoles_library_FC100.csv")
ColpidiumVacuoles$species <- "ColpidiumVacuoles"

Cosmarium <- read_csv("Data/libraries_FC100/Cosmarium_library_FC100.csv")
Cosmarium$species <- "Cosmarium"

Cryptomonas <- read_csv("Data/libraries_FC100/Cryptomonas_library_FC100.csv")
Cryptomonas$species <- "Cryptomonas"

Debris <- read_csv("Data/libraries_FC100/Debris_library_FC100.csv")
Debris$species <- "Debris"

Desmodesmus <- read_csv("Data/libraries_FC100/Desmodesmus_library_FC100.csv")
Desmodesmus$species <- "Desmodesmus"

Dexiostoma <- read_csv("Data/libraries_FC100/Dexiostoma_library_FC100.csv")
Dexiostoma$species <- "Dexiostoma"

Euplotes <- read_csv("Data/libraries_FC100/Euplotes_library_FC100.csv")
Euplotes$species <- "OtherCiliate"

Loxocephallus <- read_csv("Data/libraries_FC100/Loxocephallus_library_FC100.csv")
Loxocephallus$species <- "Loxocephallus"

Monoraphidium <- read_csv("Data/libraries_FC100/Monoraphidium_library_FC100.csv")
Monoraphidium$species <- "Monoraphidium"

ParameciumBursaria <- read_csv("Data/libraries_FC100/ParameciumBursaria_library_FC100.csv")
ParameciumBursaria$species <- "OtherCiliate"

ParameciumCaudatum <- read_csv("Data/libraries_FC100/ParameciumCaudatum_library_FC100.csv")
ParameciumCaudatum$species <- "OtherCiliate"

Staurastrum1 <- read_csv("Data/libraries_FC100/Staurastrum1_library_FC100.csv")
Staurastrum1$species <- "Staurastrum1"

Staurastrum2 <- read_csv("Data/libraries_FC100/Staurastrum2_library_FC100.csv")
Staurastrum2$species <- "Staurastrum2"

Stylonychia1 <- read_csv("Data/libraries_FC100/Stylonychia1_library_FC100.csv")
Stylonychia1$species <- "OtherCiliate"

Stylonychia2 <- read_csv("Data/libraries_FC100/Stylonychia2_library_FC100.csv")
Stylonychia2$species <- "OtherCiliate"

Tetrahymena <- read_csv("Data/libraries_FC100/Tetrahymena_library_FC100.csv")
Tetrahymena$species <- "Tetrahymena"


colnames <- c("Particle_ID","Area_ABD","Area_Filled","Aspect_Ratio","Average_Blue",
              "Average_Green","Average_Red","Calibration_Factor","Calibration_Image",
              "Camera","Capture_X","Capture_Y","Ch1_Area","Ch1_Peak","Ch1_Width",
              "Ch2_Area","Ch2_Peak","Ch2_Width","Ch2_Ch1_Ratio","Circle_Fit",
              "Circularity","Circularity_Hu","Compactness","Convex_Perimeter",
              "Convexity","Date","Diameter_ABD","Diameter_ESD","Edge_Gradient",
              "Elongation","Feret_Angle_Max","Feret_Angle_Min","Fiber_Curl",
              "Fiber_Straightness","Filter_Score","Geodesic_Aspect_Ratio",
              "Geodesic_Length","Geodesic_Thickness","Image_File","Image_Height",
              "Image_Width","Image_X","Image_Y","Intensity","Length",
              "Particles_Per_Chain","Perimeter","Ratio_Blue_Green","Ratio_Red_Blue",
              "Ratio_Red_Green","Roughness","Scatter_Area","Scatter_Peak","Scatter_Width",
              "Sigma_Intensity","Source_Image","Sum_Intensity","Symmetry","Time",
              "Timestamp","Transparency","Volume_ABD","Volume_ESD","Width","species")

dd_all <- rbind(airbubbles,Chlamydomonas,ChlamydomonasClumps,ColepsIrchel,ColepsViridis,
                Colpidium,ColpidiumVacuoles,Cosmarium,Cryptomonas,Debris,Desmodesmus,
                Dexiostoma,Euplotes,Loxocephallus,Monoraphidium,ParameciumBursaria,
                ParameciumCaudatum,Staurastrum1,Staurastrum2,Stylonychia1,Stylonychia2,
                Tetrahymena)

colnames(dd_all) <- colnames
table(dd_all$species)
## 
##          airbubbles       Chlamydomonas ChlamydomonasClumps       Coleps_irchel 
##                 112                1099                 359                 475 
##      Coleps_viridis           Colpidium   ColpidiumVacuoles           Cosmarium 
##                  95                 101                 264                 341 
##         Cryptomonas              Debris         Desmodesmus          Dexiostoma 
##                 781                 339                1051                 160 
##       Loxocephallus       Monoraphidium        OtherCiliate        Staurastrum1 
##                 224                1489                 164                 510 
##        Staurastrum2         Tetrahymena 
##                  75                 108

Species compositions

species <- unique(dd_all$species)
species <- species[!is.element(species,c("airbubbles","ColpidiumVacuoles","Debris","OtherCiliate","ChlamydomonasClumps"))]
compositions <- read_csv("../compositions.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   composition = col_character()
## )
## See spec(...) for full column specifications.
compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- paste0("c_",c(paste0(0,1:9),10:16))

Classifiers

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}
formula <- factor(species) ~ Area_ABD + Area_Filled + Aspect_Ratio +
      Average_Blue + Average_Green + Average_Red + Circle_Fit +
      Circularity + Compactness +
      Convexity + Diameter_ABD + Diameter_ESD + Edge_Gradient +
      Elongation + Feret_Angle_Max + Feret_Angle_Min + Fiber_Curl +
      Fiber_Straightness + Geodesic_Aspect_Ratio + Intensity + Length +
      Perimeter + Ratio_Blue_Green + Ratio_Red_Blue + Ratio_Red_Green +
      Roughness + Sigma_Intensity + Sum_Intensity + Symmetry + Transparency +
      Width

set.seed(62)
classifiers_18c <- list()
classifiers_18c_plots <- list()
classifiers_18c_assess_plots <- list()

trainingdata <- dd_all[complete.cases(dd_all), ]


for(i in 1:length(compositions_list)){
  
  if("Colpidium" %in% compositions_list[[i]]) {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","OtherCiliate","ChlamydomonasClumps"))
  } else {
    sub_trainingdata <- trainingdata %>%
      filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps"))
  }

  n.var <- length(unique(sub_trainingdata$species))
  sub_trainingdata$species <- factor(sub_trainingdata$species)
  
  split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  subsamples <- lapply(split_up, function(x) {
    x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  sub_trainingdata <- do.call("rbind", subsamples)
  
  # table(sub_trainingdata$species)
  # A stratified random split of the data
  idx_train <- createDataPartition(sub_trainingdata$species,
                                   p = 0.7, # percentage of data as training
                                   list = FALSE)
  
  sub_testdata <- sub_trainingdata[-idx_train,]
  sub_trainingdata <- sub_trainingdata[idx_train,]
  
  n.min <- min(table(sub_trainingdata$species))
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c[[i]] <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  classifiers_18c_plots[[i]] <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
names(classifiers_18c_plots) <- names(classifiers_18c) <-
  names(classifiers_18c_assess_plots) <- paste0("c_",c(paste0(0,1:9),10:16))
library(patchwork)
for(i in 1:16){
  print(classifiers_18c_plots[[i]] + classifiers_18c_assess_plots[[i]] + 
    plot_layout(widths = c(7,3)))
}

Save classifiers

saveRDS(classifiers_18c, "svm_flowcam_classifiers_18c.rds")
# cl <- readRDS("classifiers_18c_25x.rds")
# labels <- dd_all %>% 
#   group_by(species) %>%  
#   summarise(xPos = max(Area_Filled),
#             yPos = max((density(Area_Filled))$y))
# 
# library(directlabels)
# direct.label(qplot(Area_Filled,data=dd_all,colour=species,geom="density",log="x"))